home *** CD-ROM | disk | FTP | other *** search
Wrap
/* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. */ /************************************************************************** * defs.h: contains definitions for our static/dictionary based * * compression scheme that is tailored for very fast search. * **************************************************************************/ #ifndef _DEFS_H_ #define _DEFS_H_ #include <sys/types.h> #include <sys/stat.h> #include <ctype.h> #include "glimpse.h" #ifndef S_ISREG #define S_ISREG(mode) (0100000&(mode)) #endif #ifndef S_ISDIR #define S_ISDIR(mode) (0040000&(mode)) #endif #undef COMP_SUFFIX #undef DEF_STRING_FILE #undef DEF_HASH_FILE #undef DEF_FREQ_FILE #undef SIGNATURE_LEN #define MIN_WORD_LEN 1 /* smaller words are not indexed: heuristics like special_texts etc. must be used: verbatim is good enough */ #define AVG_WORD_LEN 12 /* average word length is 8-9 including '\0': have safety margin */ #define HASH_TABLE_SIZE MAX_64K_HASH #define SMALL_HASH_TABLE_SIZE MAX_4K_HASH #define HASH_ENTRY_SIZE 32 /* hash-file stores: name of len=24, a 5 digit int, a ' ' + a '\n' = 31 bytes + some padding once in a while */ #define DEF_BLOCKSIZE 4096 /* I/O unit size = OS page size */ #define MIN_BLOCKSIZE 512 /* granularity for above and below */ #define HASH_FILE_BLOCKS (HASH_TABLE_SIZE * HASH_ENTRY_SIZE / MIN_BLOCKSIZE) #define STRING_FILE_BLOCKS (HASH_TABLE_SIZE * MAX_WORD_LEN / MIN_BLOCKSIZE) #define MAX_SPECIAL_CHARS 32 /* Maximum # of special characters used during compress */ #define DEF_SPECIAL_WORDS 32 /* Special words for which 1B codes are reserved */ #define COMP_ATLEAST 10 /* At least 10% compression is needed */ #define COMP_SUFFIX ".CZ" /* Common suffix used for all compressed files: IT INCLUDES THE '.' !!! */ #define DEF_INDEX_FILE INDEX_FILE /* same as glimpse's */ #define DEF_STRING_FILE ".glimpse_uncompress" #define DEF_HASH_FILE ".glimpse_compress" #define DEF_FREQ_FILE ".glimpse_quick" #define DEF_THRESHOLD 16 /* 256? default for min bytes to be coverd before storing in hash table */ #define MAX_THRESHOLD 65535 /* MAX_WORDS*MAX_THRESHOLD must be < 2**32 - 1 = maxoffset = maxdiskspace = integer */ #define MAX_LSB 254 /* 256 - |{'\0', '\n'}| */ #define DEF_MAX_WORDS (MAX_LSB*MAX_LSB) #define SAMPLE_SIZE 8192 /* amount of data read to determine file-type: NOT CALLED FOR STDIN! */ #define SIGNATURE_LEN 16 /* to avoid calling strlen: including \0! */ typedef struct _hash_entry { struct _hash_entry *next; char *word; /* string itself */ union { int offset; /* offset into the dictionary file: used only while building compress's dict from glimpse's dict */ struct { short freq; /* number of times the word occurs -- provided it is in the dictionary */ short index; /* index into the string table */ } attribute; /* once freq > THRESHOLD, its just an index into the string table: used only while compressing a file */ } val; } hash_entry; /* * The total number of special characters (1..4) CANNOT exceed MAX_SPECIAL_CHARS. * The arrangement is as follows: * 1. SPECIAL_TEXTS * 2. SPECIAL_SEPARATORS * 3. SPECIAL_DELIMITERS * 4. VERBATIM * 5. SPECIAL_WORDS * Any rearrangement of these can be done provided the BEGIN/END values * are defined properly: the NUMs remain the same. */ #define BEGIN_SPECIAL_CHARS 1 /* character 0 is never a part of any code */ #define END_SPECIAL_CHARS 30 /* Not including begin/end verbatim */ /* Special delimiters are text-sequences which can come after a word instead of a blank: this is a subset of the above with '\n' and '\t' */ #define EASY_NUM_SPECIAL_DELIMITERS 8 /* numbered from 1 .. 8 */ #define HARD_NUM_SPECIAL_DELIMITERS 9 /* extra: a special kind of newline */ #define SPECIAL_DELIMITERS { '.', ',', ':', '-', ';', '!', '"', '\'', '\n'} #define BEGIN_SPECIAL_DELIMITERS BEGIN_SPECIAL_CHARS #define EASY_END_SPECIAL_DELIMITERS 9 #define HARD_END_SPECIAL_DELIMITERS 10 /* Special separators are things that can separate two words: they are 2blanks, 2tabs or 2newlines */ #define NUM_SEPARATORS 7 /* numbered from 10 .. 16 */ #define NEWLINE '\n' /* = HARD_END_SPECIAL_DELIMITERS --> carefully chosen so that this is TRUE !!!! Speeds up searches */ #define NOTBLANK (NEWLINE + 1) /* acts like unputc(' ') if char after a word != blk OR sp-delims */ #define BLANK (NOTBLANK + 1) #define TAB (NOTBLANK + 2) #define TWOBLANKS (NOTBLANK + 3) /* Beginning of a sentence */ #define TWOTABS (NOTBLANK + 4) /* Indentation */ #define TWONEWLINES (NOTBLANK + 5) /* Beginning of a paragraph */ #define BEGIN_SEPARATORS 10 #define END_SEPARATORS 17 /* * An alternate way would be to have a code for BLANK and NBLANKS, TAB and NTABS, and, NEWLINE and NNEWLINES: * in each of these cases, the byte occuring immediately next would determine the number of BLANKS/TABS/NEWLINES. * Though this works for a general number of cases, it needs two bytes of encoding: which makes us * wonder whether those cases occur commonly enough to waste two bytes to encode two blanks (common). * The present encoding guarantees 50% compression for any sequence of separators anyway, and is much simpler. */ /* Special texts are text-sequences which have a 1 byte codes associated with them: these appear first among the special things */ #define NUM_SPECIAL_TEXTS 13 /* numbered from 17 .. 29 */ #define SPECIAL_TEXTS { '.', ',', ':', '-', ';', '!', '"', '\'', '#', '$', '%', '(', ')'} /* Could have used ?, @ and & too */ #define BEGIN_SPECIAL_TEXTS 17 #define END_SPECIAL_TEXTS 30 /* Characters for literal text */ #define BEGIN_VERBATIM 30 #define END_VERBATIM 31 #define EASY_ONE_VERBATIM EASY_END_SPECIAL_DELIMITERS #define HARD_ONE_VERBATIM BEGIN_VERBATIM /* Is not an ascii char since ascii is 32.. */ /* BEGIN and END SPECIAL_WORDS are variables */ #if 0 /* THIS WON'T REALLY HELP SINCE SOURCE CODE RARELY HAS COMMON WORDS: KEYWORDS ARE VERY SMALL SO THEY HARDLY GIVE ANY COMPRESSION */ char special_program_chars[] = { '.', ',', ':', '-', '!', ';', '?', '+', '/', '\'', '"', '~', '`', '&', '@', '#', '$', '%', '^', '*', '=', '(', ')', '{', '}', '[', ']', '_', '|', '\\', '<', '>' }; #endif /*0*/ /* * Common exported functions. */ unsigned short encode_index(); unsigned short decode_index(); unsigned int mygetc(); int is_little_endian(); int build_string(); int build_hash(); int dump_hash(); int dump_string(); int get_word_from_offset(); int dump_and_free_string_hash(); hash_entry *insert_hash(); hash_entry *get_hash(); int hash_it(); /* * The beauty of this allocation scheme is that "free" does not need to be implemented! * The total memory occupied by both the string and hash tables is appx 1.5 MB */ #define hashfree(h) if (usemalloc) free(e); #define hashalloc(e) \ {\ if (usemalloc) (e) = (hash_entry *)malloc(sizeof(hash_entry));\ else {\ if (free_hash == NULL) free_hash = (hash_entry *)malloc(sizeof(hash_entry) * DEF_MAX_WORDS);\ if (free_hash == NULL) (e) = NULL;\ else (e) = ((next_free_hash >= DEF_MAX_WORDS) ? (NULL) : (&(free_hash[next_free_hash ++])));\ }\ if ((e) == NULL) {fprintf(stderr, "Out of memory in cast-hash-table!\n"); exit(2); }\ } #define strfree(s) if (usemalloc) free(s); /* called ONLY in the build procedure in which we can afford to be slow and do an strcpy since sizes of words are not determined: hardcoded in build_hash() */ #define stralloc(s, len) \ {\ if (usemalloc) (s) = (char *)malloc(len);\ else {\ if (free_str == NULL) free_str = (char *)malloc(AVG_WORD_LEN * DEF_MAX_WORDS);\ if (free_str == NULL) (s) = NULL;\ else (s) = ((next_free_str >= AVG_WORD_LEN * DEF_MAX_WORDS) ? (NULL) : (&(free_str[next_free_str]))); next_free_str += (len);\ }\ if ((s) == NULL) {fprintf(stderr, "Out of memory in cast-string-table!\n"); exit(2); }\ } /* There is no equivalent strtablealloc since it is hardcoded into build_string and is not used anywhere else */ /* Some flags corr. to user options: avoid global variables for options, pass flags as parameters */ #define TC_EASYSEARCH 0x1 #define TC_UNTILNEWLINE 0x2 #define TC_REMOVE 0x4 #define TC_OVERWRITE 0x8 #define TC_RECURSIVE 0x10 #define TC_ERRORMSGS 0x20 #define TC_SILENT 0x40 #define TC_NOPROMPT 0x80 #define TC_FILENAMESONSTDIN 0x100 #define CAST_VERSION "1.0" #define CAST_DATE "1994" #endif /*_DEFS_H_*/